#!/usr/bin/env python
##
#  Project: Kelindo - Unsupervised Music Organizer  
#  Author: Nicola Bicocchi <nicola.bicocchi@gmail.com>, Nicolo' Torreggiani <nicolo.torreggiani@gmail.com>, Gianluca Puglia <gianluca.puglia@gmail.com>, Matteo Senardi <pualien@gmail.com>
#  Copyright: 2013 Nicola Bicocchi, Nicolo' Torreggiani, Gianluca Puglia, Matteo Senardi
#  License: GPL-2+
#  This program is free software; you can redistribute it and/or modify it
#  under the terms of the GNU General Public License as published by the Free
#  Software Foundation; either version 2 of the License, or (at your option)
#  any later version.
# 
#  This program is distributed in the hope that it will be useful, but WITHOUT
#  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
#  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
#  more details.
# 
# On Debian GNU/Linux systems, the full text of the GNU General Public License
# can be found in the file /usr/share/common-licenses/GPL-2.
##

import re

class BagOfWords:
    """ A bag of words: each word of a "document" mapped to how often it occurs.

    According to the original author's note, this class is meant to be used by
    the Document, DocumentClass and Pool classes (not defined in this file).

    Two pieces of state are kept:
      * a dictionary mapping every word to its number of occurrences
      * the total number of words added, duplicates included
    """

    def __init__(self):
        # Total number of words added so far (duplicates are counted).
        self.__number_of_words = 0
        # Maps word -> number of occurrences.
        self.__bag_of_words = {}

    def __add__(self, other):
        """ Overloading of the "+" operator to join two BagOfWords.

        Returns a new BagOfWords whose frequencies are the per-word sums of
        both operands and whose total word count is the sum of both totals.
        Neither operand is modified.  NotImplemented is returned when `other`
        is not a BagOfWords, so that Python raises the usual TypeError.
        """
        if not isinstance(other, BagOfWords):
            return NotImplemented
        result = BagOfWords()
        merged = result.__bag_of_words
        # Copy our own counts first, then fold in the other bag's counts.
        merged.update(self.__bag_of_words)
        for word, count in other.__bag_of_words.items():
            merged[word] = merged.get(word, 0) + count
        # Keep the total word counter consistent with the merged dictionary.
        result.__number_of_words = (self.__number_of_words
                                    + other.__number_of_words)
        return result

    def __repr__(self):
        """ The representation of the underlying word/frequency dictionary """
        return repr(self.__bag_of_words)

    def add_word(self, word):
        """ A word is added in the dictionary __bag_of_words.

        The frequency of `word` and the total word counter are both
        increased by one.
        """
        self.__number_of_words += 1
        self.__bag_of_words[word] = self.__bag_of_words.get(word, 0) + 1

    def add_string(self, string):
        """ All the non numeric words of `string` are added, in lower case.

        Words are the maximal runs matched by the regular expression \\w+;
        a word which can be converted by int() is considered numeric and
        is skipped.
        """
        for word in re.findall(r'\w+', string):
            try:
                int(word)
            except ValueError:
                # Not an integer literal, so it counts as a real word.
                self.add_word(word.lower())

    def len(self):
        """ Returning the number of different words of an object """
        return len(self.__bag_of_words)

    def get_words(self):
        """ Returning a list of the words contained in the object.

        A new list is returned, so changing the bag afterwards does not
        affect a previously returned result.
        """
        return list(self.__bag_of_words)

    def bag_of_words(self):
        """ Returning the dictionary, containing the words (keys) with their frequency (values).

        This is the internal dictionary itself, not a copy.
        """
        return self.__bag_of_words

    def word_freq(self, word):
        """ Returning the frequency of a word, 0 if the word is unknown """
        return self.__bag_of_words.get(word, 0)

    def similarity(self, other_bag):
        """ Returning a similarity measure between two bags.

        The measure is the number of distinct words shared by both bags
        divided by the number of distinct words occurring in at least one
        of them; frequencies are ignored.  The result is a float between
        0.0 and 1.0.  If both bags are empty, 0.0 is returned instead of
        dividing by zero.
        """
        own_words = set(self.get_words())
        other_words = set(other_bag.get_words())
        union = own_words.union(other_words)
        if not union:
            # Two empty bags: the ratio is undefined, report "no similarity".
            return 0.0
        intersection = own_words.intersection(other_words)
        return len(intersection) / float(len(union))
